{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\core\\computation\\expressions.py:183: UserWarning: evaluating in Python space because the '*' operator is not supported by numexpr for the bool dtype, use '&' instead\n",
      "  .format(op=op_str, alt_op=unsupported[op_str]))\n"
     ]
    }
   ],
   "source": [
    "# -*- coding:utf-8 -*-\n",
    "#  数据预处理\n",
    "\n",
    "from __future__ import division\n",
    "from pandas import DataFrame,Series\n",
    "import pandas as pd\n",
    "\n",
    "datafile = 'air_data.csv'#航空公司原始数据，第一行是属性名\n",
    "data = pd.read_csv(datafile, encoding='utf-8')\n",
    "\n",
    "# 1> 数据清洗 \n",
    "# 丢弃掉票价为0的记录；丢弃票价为0、平均折扣不为零、总飞行公里大于0的记录\n",
    "\n",
    "cleanedfile = 'cleaned.xlsx'\n",
    "\n",
    "data1 = data[data['SUM_YR_1'].notnull()*data['SUM_YR_2'].notnull()] #票价非空值才保留,去掉空值\n",
    "\n",
    "#只保留票价非零的，或者平均折扣率与总飞行公里数同时为零的记录\n",
    "index1 = data1['SUM_YR_1'] != 0\n",
    "index2 = data1['SUM_YR_2'] != 0\n",
    "index3 = (data1['SEG_KM_SUM'] == 0) & (data1['avg_discount'] == 0)\n",
    "data1 = data1[index1 | index2 | index3] #或关系\n",
    "\n",
    "data1.to_excel(cleanedfile)\n",
    "data2 = data1[['LOAD_TIME','FFP_DATE','LAST_TO_END','FLIGHT_COUNT','SEG_KM_SUM','avg_discount']]\n",
    "data2.to_excel('datadecrese.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0   2706 days\n",
      "1   2597 days\n",
      "2   2615 days\n",
      "3   2047 days\n",
      "4   1816 days\n",
      "Name: L1, dtype: timedelta64[ns]\n",
      "0    88.905316\n",
      "1    85.324134\n",
      "2    85.915522\n",
      "3    67.253948\n",
      "4    59.664469\n",
      "Name: L3, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# 2> 数据规约 \n",
    "import numpy as np\n",
    "data = pd.read_excel('datadecrese.xlsx')\n",
    "\n",
    "data['L1'] = pd.to_datetime(data['LOAD_TIME']) - pd.to_datetime(data['FFP_DATE'])# 以纳秒为单位\n",
    "print(data['L1'].head())\n",
    "# data['L3'] = data['L1'].astype('int64')/10**10/8640/30 # 此方法假定每个月是30天，这方法不准确\n",
    "data['L3'] = data['L1']/np.timedelta64(1, 'M') # 将间隔时间转成月份为单位，注意，此处必须加一个中间变量 （****）\n",
    "print(data['L3'].head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      R    F       M     C      L\n",
      "0  0.03  210  580717  0.96  88.91\n",
      "1  0.23  140  293678  1.25  85.32\n",
      "2  0.37  135  283712  1.25  85.92\n",
      "3  3.23   23  281336  1.09  67.25\n",
      "4  0.17  152  309928  0.97  59.66\n"
     ]
    }
   ],
   "source": [
    "# 将表中的浮点类型保留至小数点后四为\n",
    "# f = lambda x:'%.2f' % x\n",
    "# data[['L3']]  = data[['L3']].applymap(f) # or data['L3'] = data['L3'].apply(f)\n",
    "# data[['L3']]  = data[['L3']].astype('float64')# 注意:使用apply或applymap后，数据类型变成Object,若后续有需要需要在此进行类型转换\n",
    "\n",
    "data[\"L3\"] = data[\"L3\"].round(2) # 等价于上面三句话，数据类型不变\n",
    "data['LAST_TO_END'] = (data['LAST_TO_END']/30).round(2)\n",
    "data['avg_discount'] = data['avg_discount'].round(2)\n",
    "\n",
    "data.drop('L1', axis=1, inplace =True) # 删除中间变量\n",
    "data.drop(data.columns[:3], axis=1, inplace =True) # 去掉不需要的u'LOAD_TIME', u'FFP_DATE'\n",
    "data.rename(columns={'LAST_TO_END':'R','FLIGHT_COUNT':'F','SEG_KM_SUM':'M','avg_discount':'C','L3':'L'},inplace=True)\n",
    "print(data.head())\n",
    "data.to_excel('sxgz.xlsx',index=False)\n",
    "\n",
    "def f(x):\n",
    "    return Series([x.min(),x.max()], index=['min','max'])\n",
    "d = data.apply(f)\n",
    "d.to_excel('summary_data.xlsx')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3> 数据标准化\n",
    "#标准差标准化\n",
    "d1 = pd.read_excel('sxgz.xlsx')\n",
    "d1=d1.astype('float64')\n",
    "d2 = (d1-d1.mean())/d1.std()\n",
    "d1 =d2.iloc[:,[4,0,1,2,3]]\n",
    "d1.columns = ['Z'+i for i in d1.columns]#表头重命名\n",
    "d1.to_excel('sjbzh.xlsx',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
